{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "collapsed": false
   },
   "source": [
    "<h1 align=\"center\"> Word Count of Pride and Prejudice using PySpark </h1>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import re, string\n",
    "\n",
    "text_file = sc.textFile('Data/Pride_and_Prejudice.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[u'PRIDE AND PREJUDICE', u'', u'By Jane Austen', u'', u'']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# first 5 elements of the RDD\n",
    "text_file.take(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "unicode"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "type(u'lama')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'!\"#$%&\\'()*+,-./:;<=>?@[\\\\]^_`{|}~'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "string.punctuation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "punc = '!\"#$%&\\'()*+,./:;<=>?@[\\\\]^_`{|}~'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def uni_to_clean_str(x):\n",
    "    converted = x.encode('utf-8')\n",
    "    lowercased_str = converted.lower()\n",
    "    # for more difficult cases use re.split(' A|B')\n",
    "    lowercased_str = lowercased_str.replace('--',' ')\n",
    "    clean_str = lowercased_str.translate(None, punc) #Change 1\n",
    "    return clean_str"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('pardon', 16),\n",
       " ('expostulation', 1),\n",
       " ('desirable', 11),\n",
       " ('bathing-place', 1),\n",
       " ('four', 33),\n",
       " ('brightening', 1),\n",
       " ('straws', 1),\n",
       " ('sleep', 4),\n",
       " ('mansion', 2),\n",
       " ('appetite', 2),\n",
       " ('hate', 9),\n",
       " ('looking', 29),\n",
       " ('unfeeling', 3),\n",
       " ('reproofs', 2),\n",
       " ('sweetest', 2)]"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "one_RDD = text_file.flatMap(lambda x: uni_to_clean_str(x).split())\n",
    "one_RDD = one_RDD.map(lambda x: (x,1))\n",
    "one_RDD = one_RDD.reduceByKey(lambda x,y: x + y)\n",
    "one_RDD.take(15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['pride and prejudice', '', 'by jane austen', '', '']"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "one_RDD = text_file.map(lambda x: uni_to_clean_str(x))\n",
    "one_RDD.take(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[['pride', 'and', 'prejudice'], [], ['by', 'jane', 'austen'], [], []]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "one_RDD = text_file.map(lambda x: uni_to_clean_str(x).split())\n",
    "one_RDD.take(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['pride', 'and', 'prejudice', 'by', 'jane']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "one_RDD = text_file.flatMap(lambda x: uni_to_clean_str(x).split())\n",
    "one_RDD.take(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('pride', 1), ('and', 1), ('prejudice', 1), ('by', 1), ('jane', 1)]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "one_RDD = text_file.flatMap(lambda x: uni_to_clean_str(x).split())\n",
    "one_RDD = one_RDD.map(lambda x: (x,1))\n",
    "one_RDD.take(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('pardon', 16),\n",
       " ('expostulation', 1),\n",
       " ('desirable', 11),\n",
       " ('bathing-place', 1),\n",
       " ('four', 33),\n",
       " ('brightening', 1),\n",
       " ('straws', 1),\n",
       " ('sleep', 4),\n",
       " ('mansion', 2),\n",
       " ('appetite', 2),\n",
       " ('hate', 9),\n",
       " ('looking', 29),\n",
       " ('unfeeling', 3),\n",
       " ('reproofs', 2),\n",
       " ('sweetest', 2)]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "one_RDD = text_file.flatMap(lambda x: uni_to_clean_str(x).split())\n",
    "one_RDD = one_RDD.map(lambda x: (x,1))\n",
    "one_RDD = one_RDD.reduceByKey(lambda x,y: x + y)\n",
    "one_RDD.take(15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[(4331, 'the'),\n",
       " (4138, 'to'),\n",
       " (3611, 'of'),\n",
       " (3578, 'and'),\n",
       " (2225, 'her'),\n",
       " (2069, 'i'),\n",
       " (1947, 'a'),\n",
       " (1866, 'in'),\n",
       " (1846, 'was'),\n",
       " (1710, 'she'),\n",
       " (1579, 'that'),\n",
       " (1535, 'it'),\n",
       " (1429, 'not'),\n",
       " (1357, 'you'),\n",
       " (1336, 'he')]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "one_RDD = text_file.flatMap(lambda x: uni_to_clean_str(x).split())\n",
    "one_RDD = one_RDD.map(lambda x: (x,1)) \n",
    "one_RDD = one_RDD.reduceByKey(lambda x,y: x + y)\n",
    "one_RDD.take(5)\n",
    "one_RDD = one_RDD.map(lambda x:(x[1],x[0])) \n",
    "one_RDD.take(5)\n",
    "one_RDD.sortByKey(False).take(15)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.11"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
